package com.qingyun.service.compare;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * 提取每页的摘要信息（指纹）
 */
public class PageFingerprinter {
    public static List<String> extractPageSummaries(PDDocument document) throws IOException, IOException {
        List<String> summaries = new ArrayList<>();
        for (int i = 0; i < document.getNumberOfPages(); i++) {
            PDPage page = document.getPage(i);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(i + 1);
            stripper.setEndPage(i + 1);
            String text = stripper.getText(document);
            // 去掉回车符、空格、换行符、换页符、缩进符、'\r'
            text = text.replaceAll("\\s+", "");
            // 取前 100 字作为指纹（可根据需要扩展）
            //String summary = text.length() > 500 ? text.substring(0, 500) : text;
            summaries.add(text);
        }
        return summaries;
    }
}
